library(data.table)
library(rmarkdown)
library(AnnotationHub)
library(tidyverse)
library(tximport)
library(ggplot2)
library(DESeq2)
library(gridExtra)
library(pheatmap)
library(UpSetR)
library(ensembldb)
library(apeglm)
library(ashr)
DB <- "EnsDb" # Assing your species
AnnotationSpecies <- "mus musculus" # Assign your species
ah <- AnnotationHub(hub=getAnnotationHubOption("URL")) # Connect to annotation DB
# Filter annotation of interest
ahQuery <- query(ah,
pattern=c(DB, AnnotationSpecies),
ignore.case=T)
# Select the most recent data
DBName <- mcols(ahQuery) %>%
rownames() %>%
tail(1)
AnnoDb <- ah[[DBName]]
# Explore your EnsDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="TXID")
# Note: Annotation has to be done with not genome but transcripts
AnnoDb <- select(AnnoDb,
AnnoKey,
keytype="TXID",
columns=c("GENEID", "GENENAME"))
# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)
## TXID GENEID GENENAME
## 1 ENSMUST00000000001 ENSMUSG00000000001 Gnai3
## 2 ENSMUST00000000003 ENSMUSG00000000003 Pbsn
## 3 ENSMUST00000000010 ENSMUSG00000020875 Hoxb9
## 4 ENSMUST00000000028 ENSMUSG00000000028 Cdc45
## 5 ENSMUST00000000033 ENSMUSG00000048583 Igf2
## 6 ENSMUST00000000049 ENSMUSG00000000049 Apoh
.sf files have been created from fastq data by salmon
# This code chunk needs to be written by yourself
#
# Define sample names
SampleNames <- c(paste0("WT-rep", 1:3), paste0("cKO-rep", 1:3))
# Define group level
GroupLevel <- c("WT", "cKO")
# Define contrast for DE analysis
Contrast <- c("Group", "cKO", "WT")
# Define a vector for comparing TPM vs Counts effect
TvC <- c("TPM", "Counts")
levels(TvC) <- TvC
# Define .sf file path
sf <- c(paste0("salmon_output/",
SampleNames,
".salmon_quant/quant.sf"))
# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))
# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
Group=factor(group, levels=GroupLevel),
Path=sf)
rownames(metadata) <- SampleNames
# Explore the metadata
print(metadata)
## Sample Group Path
## WT-rep1 WT-rep1 WT salmon_output/WT-rep1.salmon_quant/quant.sf
## WT-rep2 WT-rep2 WT salmon_output/WT-rep2.salmon_quant/quant.sf
## WT-rep3 WT-rep3 WT salmon_output/WT-rep3.salmon_quant/quant.sf
## cKO-rep1 cKO-rep1 cKO salmon_output/cKO-rep1.salmon_quant/quant.sf
## cKO-rep2 cKO-rep2 cKO salmon_output/cKO-rep2.salmon_quant/quant.sf
## cKO-rep3 cKO-rep3 cKO salmon_output/cKO-rep3.salmon_quant/quant.sf
# Assign sample names to the input (.sf) file path
names(sf) <- SampleNames
# Run tximport
# tpm vs original counts
# input sf: a factor of all .sf files' path
txi_tpm <- tximport(sf,
type="salmon",
tx2gene=AnnoDb,
countsFromAbundance="lengthScaledTPM", # Extracts TPM
ignoreTxVersion=T)
txi_counts <- tximport(sf,
type="salmon",
tx2gene=AnnoDb,
ignoreTxVersion=T)
# tpm
head(txi_tpm$counts)
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2
## ENSMUSG00000000001 6577.34422 5799.47405 5642.76617 5995.7310672 6077.731462
## ENSMUSG00000000003 0.00000 0.00000 0.00000 0.0000000 0.000000
## ENSMUSG00000000028 3920.55098 3550.15001 3714.90523 3421.9627713 3407.169948
## ENSMUSG00000000031 25.55295 61.50787 45.36071 9.5017579 38.695366
## ENSMUSG00000000037 85.39075 75.78040 107.66889 138.5407492 170.159821
## ENSMUSG00000000049 0.00000 0.00000 0.00000 0.9981564 1.995204
## cKO-rep3
## ENSMUSG00000000001 5349.19912
## ENSMUSG00000000003 0.00000
## ENSMUSG00000000028 3121.53476
## ENSMUSG00000000031 95.49668
## ENSMUSG00000000037 137.19658
## ENSMUSG00000000049 0.00000
dim(txi_tpm$counts)
## [1] 54347 6
# counts
head(txi_counts$counts)
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001 6492.000 5822.000 5614.000 6040.00 6142.00 5437.000
## ENSMUSG00000000003 0.000 0.000 0.000 0.00 0.00 0.000
## ENSMUSG00000000028 3904.066 3567.989 3714.192 3410.46 3463.04 3142.454
## ENSMUSG00000000031 33.000 45.000 58.000 12.00 28.00 70.000
## ENSMUSG00000000037 119.001 90.001 82.000 165.00 129.00 100.000
## ENSMUSG00000000049 0.000 0.000 0.000 1.00 2.00 0.000
dim(txi_counts$counts)
## [1] 54347 6
# Set a function creating dds and vsd
dds_vsd_fn <- function(txi) {
# Create a DESeq object (so-calledd dds)
des <- DESeqDataSetFromTximport(txi,
colData=metadata,
design=~Group)
# Create a vsd object (so-called vsd)
ves <- vst(des, blind=T)
# Output them as a list
return(list(dds=des, vsd=ves))
}
TPM <- dds_vsd_fn(txi_tpm)
Counts <- dds_vsd_fn(txi_counts)
# Outputs
# dds: TPM/Counts[[1]] or TPM/Counts[['dds']]
# vsd: TPM/Counts[[2]] or TPM/Counts[['vsd']]
# TPM
TPM[[1]]
## class: DESeqDataSet
## dim: 54347 6
## metadata(1): version
## assays(1): counts
## rownames(54347): ENSMUSG00000000001 ENSMUSG00000000003 ...
## ENSMUSG00000118658 ENSMUSG00000118659
## rowData names(0):
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
head(counts(TPM[[1]]))
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001 6577 5799 5643 5996 6078 5349
## ENSMUSG00000000003 0 0 0 0 0 0
## ENSMUSG00000000028 3921 3550 3715 3422 3407 3122
## ENSMUSG00000000031 26 62 45 10 39 95
## ENSMUSG00000000037 85 76 108 139 170 137
## ENSMUSG00000000049 0 0 0 1 2 0
# Counts
Counts[[1]]
## class: DESeqDataSet
## dim: 54347 6
## metadata(1): version
## assays(2): counts avgTxLength
## rownames(54347): ENSMUSG00000000001 ENSMUSG00000000003 ...
## ENSMUSG00000118658 ENSMUSG00000118659
## rowData names(0):
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
head(counts(Counts[[1]]))
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001 6492 5822 5614 6040 6142 5437
## ENSMUSG00000000003 0 0 0 0 0 0
## ENSMUSG00000000028 3904 3568 3714 3410 3463 3142
## ENSMUSG00000000031 33 45 58 12 28 70
## ENSMUSG00000000037 119 90 82 165 129 100
## ENSMUSG00000000049 0 0 0 1 2 0
# TPM
TPM[[2]]
## class: DESeqTransform
## dim: 54347 6
## metadata(1): version
## assays(1): ''
## rownames(54347): ENSMUSG00000000001 ENSMUSG00000000003 ...
## ENSMUSG00000118658 ENSMUSG00000118659
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(4): Sample Group Path sizeFactor
# Counts
Counts[[2]]
## class: DESeqTransform
## dim: 54347 6
## metadata(1): version
## assays(1): ''
## rownames(54347): ENSMUSG00000000001 ENSMUSG00000000003 ...
## ENSMUSG00000118658 ENSMUSG00000118659
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
# Set a function estimating size factors, dispersions, and perform wald test
DESeqPrep_fn <- function(List) {
List[[1]] <- estimateSizeFactors(List[[1]])
List[[1]] <- estimateDispersions(List[[1]])
List[[1]] <- nbinomWaldTest(List[[1]])
return(List)
}
# Update dds with the function
Counts <- DESeqPrep_fn(Counts)
TPM <- DESeqPrep_fn(TPM)
sizeFactors(Counts[[1]])
## NULL
sizeFactors(TPM[[1]])
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1.1105548 0.9995683 0.9771351 1.0201440 1.0317426 0.8932568
# Size factors don't exist in the Counts dds!
# Normalization factors are calculated in the Counts dds instead!
assays(Counts[[1]])
## List of length 6
## names(6): counts avgTxLength normalizationFactors mu H cooks
assays(TPM[[1]])
## List of length 4
## names(4): counts mu H cooks
colData(Counts[[1]])
## DataFrame with 6 rows and 3 columns
## Sample Group Path
## <factor> <factor> <character>
## WT-rep1 WT-rep1 WT salmon_output/WT-rep..
## WT-rep2 WT-rep2 WT salmon_output/WT-rep..
## WT-rep3 WT-rep3 WT salmon_output/WT-rep..
## cKO-rep1 cKO-rep1 cKO salmon_output/cKO-re..
## cKO-rep2 cKO-rep2 cKO salmon_output/cKO-re..
## cKO-rep3 cKO-rep3 cKO salmon_output/cKO-re..
colData(TPM[[1]])
## DataFrame with 6 rows and 4 columns
## Sample Group Path sizeFactor
## <factor> <factor> <character> <numeric>
## WT-rep1 WT-rep1 WT salmon_output/WT-rep.. 1.110555
## WT-rep2 WT-rep2 WT salmon_output/WT-rep.. 0.999568
## WT-rep3 WT-rep3 WT salmon_output/WT-rep.. 0.977135
## cKO-rep1 cKO-rep1 cKO salmon_output/cKO-re.. 1.020144
## cKO-rep2 cKO-rep2 cKO salmon_output/cKO-re.. 1.031743
## cKO-rep3 cKO-rep3 cKO salmon_output/cKO-re.. 0.893257
# Extract and save the size factors in a data frame
sizeFactor <- as.data.frame(round(sizeFactors(TPM[[1]]), 3))
colnames(sizeFactor) <- 'Size_Factor'
sizeFactor <- sizeFactor %>%
rownames_to_column(var="Sample") %>%
inner_join(metadata[, 1:ncol(metadata)-1], by="Sample")
sizeFactor$Sample <- factor(sizeFactor$Sample, levels=SampleNames)
# Create a plot comparing the size factors by sample
ggplot(sizeFactor, aes(x=Sample,
y=Size_Factor,
fill=Group,
label=Size_Factor)) +
geom_bar(stat="identity", width=0.8) +
theme_bw() +
ggtitle("Size Factors in TPM-DESeq") +
geom_text(vjust=1.5) +
theme(axis.text.x=element_text(angle=45,
vjust=0.5)) + ylab("Size Factor") + geom_hline(yintercept=1, color="blue", linetype="dashed")
# Extract and normalization factors in a data frame
normf <- as.data.frame(normalizationFactors(Counts[[1]])) %>%
gather(Sample, Normalization_Factor) %>%
inner_join(metadata[, 1:2], by="Sample")
normf$Sample <- factor(normf$Sample, levels=SampleNames)
normf$Group <- factor(normf$Group, levels=GroupLevel)
# Create a scatter plot showing distribution of normalization factors
normFactors_plot <- ggplot(normf,
aes(x=Sample, y=Normalization_Factor)) +
geom_jitter(alpha=0.5, aes(color=Group)) +
# Add a boxplot to provide statistics in each sample
geom_boxplot(aes(x=Sample, y=Normalization_Factor),
outlier.shape=NA, alpha=0.5) +
theme_bw() +
ggtitle("Normalization Factors in Counts-DESeq") +
theme(axis.text.x=element_text(angle=45,
vjust=0.5)) +
ylab("Normalization Factor / Gene") +
# Add a dashed horizontal line to indicate where normalization factor=1
geom_hline(yintercept=1,
color="blue",
linetype="dashed")
# Print the normalization factor scatter plot
print(normFactors_plot)
# Print the same plot with larger y-magnification in order to observe the box plot
normFactors_plot +
ylim(0.8, 1.2)
# Assigne what to compare
GroupOfInterest <- Contrast[1]
# Set a function for a PCA plot
QCPCA_fn <- function(inputList, Title) {
plotPCA(inputList[[2]], # takes vsd
intgroup=GroupOfInterest) + theme_bw() + ggtitle(Title)
}
# Set heatmap annotation
ColOfInterest <- !colnames(metadata) %in% c("Sample", "Path")
HeatmapAnno <- as.data.frame(metadata[, ColOfInterest])
rownames(HeatmapAnno) <- SampleNames
colnames(HeatmapAnno) <- colnames(metadata)[ColOfInterest]
# Set a function for a correlation heatmap
QCcorrHeatmap_fn <- function(inputList, Title) {
# Extract transformed count matrix
mtx <- assay(inputList[[2]]) # takes vsd
# Calculate correlation and store in the matrix
mtx <- cor(mtx)
# Create a correlation heatmap
return(pheatmap(mtx,
annotation=HeatmapAnno,
main=paste("Sample Correlation Heatmap:",
Title)))
}
grid.arrange(QCPCA_fn(TPM, "QC PCA: TPM"),
QCPCA_fn(Counts, "QC PCA: Counts"),
nrow=2)
# TPM
QCcorrHeatmap_fn(TPM, "TPM")
# Counts
QCcorrHeatmap_fn(Counts, "Counts")
# Create a list consisted with dds objects from TPM and Counts
desList <- list(TPM[[1]], Counts[[1]])
names(desList) <- TvC
# Create a list for TPM and Counts dds
ddsList <- desList # DE without shrinkage
normal.ddsList <- desList # DE with "normal" shrinkage
ape.ddsList <- desList # DE with "apeglm" shrinkage
ash.ddsList <- desList # DE with "ashr" shrinkage
for (x in TvC) {
# Run DESeq()
ddsList[[x]] <- DESeq(desList[[x]])
print(resultsNames(ddsList[[x]]))
normal.ddsList[[x]] <- lfcShrink(ddsList[[x]],
contrast=Contrast,
type="normal")
ape.ddsList[[x]] <- lfcShrink(ddsList[[x]],
coef=resultsNames(ddsList[[x]])[2],
type="apeglm")
ash.ddsList[[x]] <- lfcShrink(ddsList[[x]],
coef=resultsNames(ddsList[[x]])[2],
type="ashr")
}
## [1] "Intercept" "Group_cKO_vs_WT"
## [1] "Intercept" "Group_cKO_vs_WT"
# Combine every ddsList into a list
all.ddsList <- list(ddsList, normal.ddsList, ape.ddsList, ash.ddsList)
shrinkage <- c("None", "Normal", "Apeglm", "Ashr")
names(all.ddsList) <- shrinkage
# Plot dispersion
for (x in TvC) {
plotDispEsts(ddsList[[x]],
ylab="Dispersion",
xlab="Mean of Normalized Counts",
main=paste("Dispersion of", x, "Input"))
}
# Set FDR threshold
alpha=0.1
# FDR threshold vector
FDRv=c("< 0.1", "> 0.1")
# Initialize lists of result tables
all.resList <- all.ddsList
# Set a function cleaning table
Sig.fn <- function(df, Input) {
df <- df %>%
rownames_to_column(var="Gene") %>%
mutate(FDR=ifelse(padj < 0.1 & !is.na(padj),
FDRv[1],
FDRv[2]),
Input=Input)
return(df)
}
for (i in shrinkage) {
if (i == "None") {
for (x in TvC) {
# Extract data frames from unshrunken lfc & clean data
all.resList[[i]][[x]] <- as.data.frame(results(all.ddsList[[i]][[x]],
contrast=Contrast,
alpha=alpha)) %>%
Sig.fn(x)
}
} else {
# Extract data frames from shrunken lfc & clean data
for (x in TvC) {
all.resList[[i]][[x]] <- as.data.frame(all.ddsList[[i]][[x]]) %>%
Sig.fn(x)
}
}
}
# Explore the results
summary(all.resList)
## Length Class Mode
## None 2 -none- list
## Normal 2 -none- list
## Apeglm 2 -none- list
## Ashr 2 -none- list
head(all.resList[[1]][['TPM']])
## Gene baseMean log2FoldChange lfcSE stat
## 1 ENSMUSG00000000001 5875.9369080 0.02099816 0.05786985 0.3628515
## 2 ENSMUSG00000000003 0.0000000 NA NA NA
## 3 ENSMUSG00000000028 3505.9693632 -0.10055360 0.07303168 -1.3768491
## 4 ENSMUSG00000000031 47.5744267 0.22519197 0.73238390 0.3074780
## 5 ENSMUSG00000000037 119.5824586 0.79057118 0.24424258 3.2368278
## 6 ENSMUSG00000000049 0.4864536 2.44892110 3.95081414 0.6198523
## pvalue padj FDR Input
## 1 0.716715835 0.94295890 > 0.1 TPM
## 2 NA NA > 0.1 TPM
## 3 0.168558919 0.59608179 > 0.1 TPM
## 4 0.758479529 0.95405920 > 0.1 TPM
## 5 0.001208663 0.03084828 < 0.1 TPM
## 6 0.535355054 NA > 0.1 TPM
head(all.resList[[1]][['Counts']])
## Gene baseMean log2FoldChange lfcSE stat
## 1 ENSMUSG00000000001 5914.7607747 0.02127119 0.05884416 0.3614834
## 2 ENSMUSG00000000003 0.0000000 NA NA NA
## 3 ENSMUSG00000000028 3528.9553327 -0.10034826 0.07366789 -1.3621709
## 4 ENSMUSG00000000031 45.9580425 0.22330899 0.74116522 0.3012945
## 5 ENSMUSG00000000037 116.2538103 0.80398885 0.24294058 3.3094053
## 6 ENSMUSG00000000049 0.4886878 2.45560171 3.86195822 0.6358437
## pvalue padj FDR Input
## 1 0.7177381103 0.9481132 > 0.1 Counts
## 2 NA NA > 0.1 Counts
## 3 0.1731439441 0.6154148 > 0.1 Counts
## 4 0.7631899485 0.9585726 > 0.1 Counts
## 5 0.0009349437 0.0258125 < 0.1 Counts
## 6 0.5248783168 NA > 0.1 Counts
head(all.resList[[2]][['TPM']])
## Gene baseMean log2FoldChange lfcSE stat
## 1 ENSMUSG00000000001 5875.9369080 0.02046530 0.05640137 0.3628515
## 2 ENSMUSG00000000003 0.0000000 NA NA NA
## 3 ENSMUSG00000000028 3505.9693632 -0.09654963 0.07012366 -1.3768491
## 4 ENSMUSG00000000031 47.5744267 0.04345855 0.14166307 0.3074780
## 5 ENSMUSG00000000037 119.5824586 0.53991562 0.16670298 3.2368278
## 6 ENSMUSG00000000049 0.4864536 0.02252178 0.03169874 0.6198523
## pvalue padj FDR Input
## 1 0.716715835 0.94295890 > 0.1 TPM
## 2 NA NA > 0.1 TPM
## 3 0.168558919 0.59608179 > 0.1 TPM
## 4 0.758479529 0.95405920 > 0.1 TPM
## 5 0.001208663 0.03084828 < 0.1 TPM
## 6 0.535355054 NA > 0.1 TPM
head(all.resList[[2]][['Counts']])
## Gene baseMean log2FoldChange lfcSE stat
## 1 ENSMUSG00000000001 5914.7607747 0.02071745 0.05731238 0.3614834
## 2 ENSMUSG00000000003 0.0000000 NA NA NA
## 3 ENSMUSG00000000028 3528.9553327 -0.09631344 0.07070590 -1.3621709
## 4 ENSMUSG00000000031 45.9580425 0.04230654 0.14143386 0.3012945
## 5 ENSMUSG00000000037 116.2538103 0.55120829 0.16681149 3.3094053
## 6 ENSMUSG00000000049 0.4886878 0.02375283 0.03262421 0.6358437
## pvalue padj FDR Input
## 1 0.7177381103 0.9481132 > 0.1 Counts
## 2 NA NA > 0.1 Counts
## 3 0.1731439441 0.6154148 > 0.1 Counts
## 4 0.7631899485 0.9585726 > 0.1 Counts
## 5 0.0009349437 0.0258125 < 0.1 Counts
## 6 0.5248783168 NA > 0.1 Counts
# Set ylim: has to adjusted by users depending on data
yl <- c(-7.5, 7.5)
# Set min log2 fold change of interest
mLog <- c(-1, 1)
# Initialize a list storing MA plots
MAList <- ddsList
# Create MA plots
for (i in shrinkage) {
both.df <- rbind(all.resList[[i]][[TvC[1]]],
all.resList[[i]][[TvC[2]]])
MAList[[i]] <- ggplot(both.df,
aes(x=baseMean, y=log2FoldChange, color=FDR)) +geom_point() + scale_x_log10() + facet_grid(~Input) +
theme_bw() +
scale_color_manual(values=c("blue", "grey")) +
ggtitle(paste("MA plot with", i)) +
ylim(yl[1], yl[2]) +
geom_hline(yintercept=c(mLog[1], mLog[2]), linetype="dashed", color="red")
}
# Print MA plots
MAList
## $TPM
## class: DESeqDataSet
## dim: 54347 6
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(54347): ENSMUSG00000000001 ENSMUSG00000000003 ...
## ENSMUSG00000118658 ENSMUSG00000118659
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(4): Sample Group Path sizeFactor
##
## $Counts
## class: DESeqDataSet
## dim: 54347 6
## metadata(1): version
## assays(6): counts avgTxLength ... H cooks
## rownames(54347): ENSMUSG00000000001 ENSMUSG00000000003 ...
## ENSMUSG00000118658 ENSMUSG00000118659
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
##
## $None
##
## $Normal
##
## $Apeglm
##
## $Ashr
# Subset rows with FDR < alpha
both.df <- rbind(all.resList[[1]][['TPM']],
all.resList[[1]][['Counts']])
both.df$Input <- factor(both.df$Input, levels=TvC)
# Plot distribution of FDR
ggplot(both.df,
aes(x=padj, color=Input)) +
geom_density(size=1, aes(y=..count..)) +
theme_bw() +
ggtitle("Distribution of False Discovery Rate (FDR)") +
xlab("Adjusted P-Value (FDR)") +
ylab("Count") +
geom_vline(xintercept=alpha,
color="black",
linetype="dashed",
size=1) +
scale_x_continuous(breaks=seq(0, 1, by=0.1))
# Initialize a lfc list
lfcplotList <- all.resList
# Create lfc distribution plots
for (i in shrinkage) {
lfc.df <- rbind(all.resList[[i]][[TvC[1]]],
all.resList[[i]][[TvC[2]]])
lfc.df <- lfc.df[lfc.df$FDR == "< 0.1",]
lfc.df$Input <- factor(lfc.df$Input, levels=TvC)
lfcplotList[[i]] <- ggplot(lfc.df, # Subset rows with FDR < alpha
aes(x=log2FoldChange, color=Input)) +
geom_density(size=1, aes(y=..count..)) +
theme_bw() + ylab("Count") +
geom_vline(xintercept=c(mLog[1], mLog[2]),
color="black",
linetype="dashed",
size=1) +
ggtitle(paste("Distribution of Log2FoldChange by Input Type:", i)) +
xlim(-5, 5)
}
# Print the lfc plots
lfcplotList
## $None
##
## $Normal
##
## $Apeglm
##
## $Ashr
# Count number of NA genes
type=c("Zero Counts", "Outliers", "Total NA Genes")
# Create a data frame storing NA gene number
NAstat <- both.df %>%
group_by(Input) %>%
summarize(zero=sum(is.na(log2FoldChange)),
outlier=sum(is.na(pvalue) & is.na(padj))) %>%
mutate(total=zero + outlier) %>%
gather(Type, Number, -Input) %>%
mutate(Type=factor(case_when(Type == "zero" ~ type[1],
Type == "outlier" ~ type[2],
Type == "total" ~ type[3]),
levels=type))
# Plot number of NA genes
ggplot(NAstat,
aes(x=Type, y=Number, group=Input, fill=Input, label=Number)) +
geom_bar(stat="identity", position="dodge") +
theme_bw() +
geom_text(position=position_dodge(width=1), vjust=1.5) +
ggtitle("Number of NA Genes") +
ylab("Number of Genes")
# Create a new list having DE table with FDR below alpha
fdr.rank <- all.resList
lfc.rank <- all.resList
up.lfc.rank <- all.resList
down.lfc.rank <- all.resList
# Set a function cleaning a data frame
filter.fdr.fn <- function(df) {as.data.table(df[df$FDR == FDRv[1],])}
# Set a function creating a column for the rank
Ranking.fn <- function(x) {mutate(x, Rank=1:nrow(x))}
for (i in shrinkage) {
for (x in TvC) {
rankdf <- all.resList[[i]][[x]]
fdr.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(padj) %>% Ranking.fn()
lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(desc(abs(log2FoldChange))) %>% Ranking.fn()
up.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>%
arrange(desc(log2FoldChange)) %>%
Ranking.fn()
down.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>%
arrange(log2FoldChange) %>%
Ranking.fn()
}
}
# Explore the ranking outputs
head(fdr.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat
## 1: ENSMUSG00000026463 1486.8997 -1.696086 0.1216619 -13.94098
## 2: ENSMUSG00000022018 637.3808 -1.277895 0.1155273 -11.06141
## 3: ENSMUSG00000032011 16059.7891 -1.909339 0.1840373 -10.37473
## 4: ENSMUSG00000040498 712.2468 -2.230573 0.2167892 -10.28913
## 5: ENSMUSG00000047180 1001.3846 -1.412164 0.1395955 -10.11611
## 6: ENSMUSG00000029468 1457.2985 -1.743485 0.1739533 -10.02272
## pvalue padj FDR Input Rank
## 1: 3.570101e-44 4.829276e-40 < 0.1 TPM 1
## 2: 1.930438e-28 1.305652e-24 < 0.1 TPM 2
## 3: 3.231112e-25 1.456908e-21 < 0.1 TPM 3
## 4: 7.888262e-25 2.667613e-21 < 0.1 TPM 4
## 5: 4.686796e-24 1.267966e-20 < 0.1 TPM 5
## 6: 1.211213e-23 2.730679e-20 < 0.1 TPM 6
head(fdr.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat
## 1: ENSMUSG00000026463 1495.3259 -1.697168 0.1219119 -13.92127
## 2: ENSMUSG00000022018 641.6187 -1.276915 0.1151204 -11.09200
## 3: ENSMUSG00000032011 16149.4069 -1.909044 0.1840118 -10.37457
## 4: ENSMUSG00000040498 716.4666 -2.229149 0.2156284 -10.33792
## 5: ENSMUSG00000047180 1007.8261 -1.412092 0.1391644 -10.14693
## 6: ENSMUSG00000029468 1456.8887 -1.743020 0.1738787 -10.02434
## pvalue padj FDR Input Rank
## 1: 4.704485e-44 6.637087e-40 < 0.1 Counts 1
## 2: 1.371801e-28 9.676681e-25 < 0.1 Counts 2
## 3: 3.236503e-25 1.522019e-21 < 0.1 Counts 3
## 4: 4.747503e-25 1.674444e-21 < 0.1 Counts 4
## 5: 3.419387e-24 9.648143e-21 < 0.1 Counts 5
## 6: 1.191522e-23 2.801665e-20 < 0.1 Counts 6
head(lfc.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSMUSG00000102298 85.84981 9.904294 1.2072556 8.203975 2.325671e-16
## 2: ENSMUSG00000103587 27.65904 8.271554 1.2479720 6.627997 3.402732e-11
## 3: ENSMUSG00000081228 17.89201 -7.573484 1.2876161 -5.881787 4.058600e-09
## 4: ENSMUSG00000026616 42.71798 -5.132552 1.7434396 -2.943923 3.240807e-03
## 5: ENSMUSG00000052105 21.86338 4.433258 0.8809757 5.032213 4.848491e-07
## 6: ENSMUSG00000025929 39.39414 -3.504122 0.7724056 -4.536635 5.715904e-06
## padj FDR Input Rank
## 1: 2.621612e-13 < 0.1 TPM 1
## 2: 1.438399e-08 < 0.1 TPM 2
## 3: 1.098014e-06 < 0.1 TPM 3
## 4: 6.200622e-02 < 0.1 TPM 4
## 5: 6.760611e-05 < 0.1 TPM 5
## 6: 4.924779e-04 < 0.1 TPM 6
head(lfc.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSMUSG00000102298 86.40658 9.912934 1.206247 8.217994 2.069355e-16
## 2: ENSMUSG00000103587 28.01962 8.289688 1.244466 6.661242 2.715225e-11
## 3: ENSMUSG00000081228 17.90003 -7.565160 1.283233 -5.895388 3.738009e-09
## 4: ENSMUSG00000091071 92.97825 -6.586493 2.094340 -3.144902 1.661426e-03
## 5: ENSMUSG00000026616 22.99255 -5.134312 1.433372 -3.581982 3.409973e-04
## 6: ENSMUSG00000052105 12.95269 4.486248 0.851751 5.267089 1.386040e-07
## padj FDR Input Rank
## 1: 2.432872e-13 < 0.1 Counts 1
## 2: 1.197075e-08 < 0.1 Counts 2
## 3: 1.054717e-06 < 0.1 Counts 3
## 4: 3.952681e-02 < 0.1 Counts 4
## 5: 1.276072e-02 < 0.1 Counts 5
## 6: 2.300501e-05 < 0.1 Counts 6
head(up.lfc.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSMUSG00000102298 85.84981 9.904294 1.2072556 8.203975 2.325671e-16
## 2: ENSMUSG00000103587 27.65904 8.271554 1.2479720 6.627997 3.402732e-11
## 3: ENSMUSG00000052105 21.86338 4.433258 0.8809757 5.032213 4.848491e-07
## 4: ENSMUSG00000027860 17.78000 2.864749 0.9491109 3.018351 2.541545e-03
## 5: ENSMUSG00000067599 51.07036 2.720996 0.5866686 4.638046 3.517192e-06
## 6: ENSMUSG00000049881 39.64867 2.712169 0.7863516 3.449054 5.625549e-04
## padj FDR Input Rank
## 1: 2.621612e-13 < 0.1 TPM 1
## 2: 1.438399e-08 < 0.1 TPM 2
## 3: 6.760611e-05 < 0.1 TPM 3
## 4: 5.232798e-02 < 0.1 TPM 4
## 5: 3.258702e-04 < 0.1 TPM 5
## 6: 1.752513e-02 < 0.1 TPM 6
head(up.lfc.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSMUSG00000102298 86.40658 9.912934 1.2062475 8.217994 2.069355e-16
## 2: ENSMUSG00000103587 28.01962 8.289688 1.2444658 6.661242 2.715225e-11
## 3: ENSMUSG00000052105 12.95269 4.486248 0.8517510 5.267089 1.386040e-07
## 4: ENSMUSG00000085328 12.82875 3.508424 1.2470414 2.813398 4.902089e-03
## 5: ENSMUSG00000094825 12.92533 3.215367 0.9713344 3.310257 9.321022e-04
## 6: ENSMUSG00000043932 14.95823 3.169154 0.8873769 3.571373 3.551149e-04
## padj FDR Input Rank
## 1: 2.432872e-13 < 0.1 Counts 1
## 2: 1.197075e-08 < 0.1 Counts 2
## 3: 2.300501e-05 < 0.1 Counts 3
## 4: 8.332370e-02 < 0.1 Counts 4
## 5: 2.578450e-02 < 0.1 Counts 5
## 6: 1.308084e-02 < 0.1 Counts 6
head(down.lfc.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSMUSG00000081228 17.89201 -7.573484 1.2876161 -5.881787 4.058600e-09
## 2: ENSMUSG00000026616 42.71798 -5.132552 1.7434396 -2.943923 3.240807e-03
## 3: ENSMUSG00000025929 39.39414 -3.504122 0.7724056 -4.536635 5.715904e-06
## 4: ENSMUSG00000038305 27.67164 -3.315884 0.9145036 -3.625884 2.879744e-04
## 5: ENSMUSG00000037129 21.70495 -2.996111 0.6636694 -4.514463 6.347751e-06
## 6: ENSMUSG00000030769 61.98969 -2.895925 0.7804593 -3.710539 2.068186e-04
## padj FDR Input Rank
## 1: 1.098014e-06 < 0.1 TPM 1
## 2: 6.200622e-02 < 0.1 TPM 2
## 3: 4.924779e-04 < 0.1 TPM 3
## 4: 1.088098e-02 < 0.1 TPM 4
## 5: 5.400379e-04 < 0.1 TPM 5
## 6: 8.452072e-03 < 0.1 TPM 6
head(down.lfc.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSMUSG00000081228 17.90003 -7.565160 1.2832335 -5.895388 3.738009e-09
## 2: ENSMUSG00000091071 92.97825 -6.586493 2.0943398 -3.144902 1.661426e-03
## 3: ENSMUSG00000026616 22.99255 -5.134312 1.4333718 -3.581982 3.409973e-04
## 4: ENSMUSG00000017002 13.01577 -3.816359 1.0324219 -3.696511 2.185830e-04
## 5: ENSMUSG00000096349 14.54338 -3.639189 1.2724638 -2.859955 4.237012e-03
## 6: ENSMUSG00000025929 39.64370 -3.514489 0.7624824 -4.609272 4.040815e-06
## padj FDR Input Rank
## 1: 1.054717e-06 < 0.1 Counts 1
## 2: 3.952681e-02 < 0.1 Counts 2
## 3: 1.276072e-02 < 0.1 Counts 3
## 4: 9.096662e-03 < 0.1 Counts 4
## 5: 7.500097e-02 < 0.1 Counts 5
## 6: 3.878082e-04 < 0.1 Counts 6
# Set a function rebuilding DE tables with gene ranks
rankdiff.fn <- function(List){
# Select columns and join the data frames by gene
full_join(List[[TvC[1]]][, .(Gene, Input, Rank, baseMean)],
List[[TvC[2]]][, .(Gene, Input, Rank, baseMean)],
by="Gene") %>%
# Add columns assining gene expression levels, rank differences, and mean ranks
mutate(logMeanExpression=log(baseMean.x+baseMean.y/2),
RankDiff=Rank.x-Rank.y,
MeanRank=(Rank.x+Rank.y)/2)
}
# Set a function adding Shrinkage column
add.shr.fn <- function(df, shr) {mutate(df, Shrinkage=shr)}
# Set a function rbinding multiple data frames
rbinds.fn <- function(List) {rbind(List[[1]],
List[[2]],
List[[3]],
List[[4]])}
# Calculate and plot rank difference
for (i in shrinkage) {
# Calculate rank difference
fdr.rank[[i]] <- rankdiff.fn(fdr.rank[[i]]) %>% add.shr.fn(i)
lfc.rank[[i]] <- rankdiff.fn(lfc.rank[[i]]) %>% add.shr.fn(i)
up.lfc.rank[[i]] <- rankdiff.fn(up.lfc.rank[[i]]) %>% add.shr.fn(i)
down.lfc.rank[[i]] <- rankdiff.fn(down.lfc.rank[[i]]) %>% add.shr.fn(i)
}
# Create combined data frames across the shrinkages
fdr.rank.df <- rbinds.fn(fdr.rank)
lfc.rank.df <- rbinds.fn(lfc.rank)
up.lfc.rank.df <- rbinds.fn(up.lfc.rank)
down.lfc.rank.df <- rbinds.fn(down.lfc.rank)
# Explore the ranking outputs
head(fdr.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000026463 TPM 1 1486.8997 Counts 1 1495.3259
## 2: ENSMUSG00000022018 TPM 2 637.3808 Counts 2 641.6187
## 3: ENSMUSG00000032011 TPM 3 16059.7891 Counts 3 16149.4069
## 4: ENSMUSG00000040498 TPM 4 712.2468 Counts 4 716.4666
## 5: ENSMUSG00000047180 TPM 5 1001.3846 Counts 5 1007.8261
## 6: ENSMUSG00000029468 TPM 6 1457.2985 Counts 6 1456.8887
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: 7.711801 0 1 None
## 2: 6.865046 0 2 None
## 3: 10.091397 0 3 None
## 4: 6.975862 0 4 None
## 5: 7.316746 0 5 None
## 6: 7.689711 0 6 None
head(lfc.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000102298 TPM 1 85.84981 Counts 1 86.40658
## 2: ENSMUSG00000103587 TPM 2 27.65904 Counts 2 28.01962
## 3: ENSMUSG00000081228 TPM 3 17.89201 Counts 3 17.90003
## 4: ENSMUSG00000026616 TPM 4 42.71798 Counts 5 22.99255
## 5: ENSMUSG00000052105 TPM 5 21.86338 Counts 6 12.95269
## 6: ENSMUSG00000025929 TPM 6 39.39414 Counts 9 39.64370
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: 4.860224 0 1.0 None
## 2: 3.729754 0 2.0 None
## 3: 3.289969 0 3.0 None
## 4: 3.992944 -1 4.5 None
## 5: 3.344265 -1 5.5 None
## 6: 4.081192 -3 7.5 None
head(up.lfc.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000102298 TPM 1 85.84981 Counts 1 86.40658
## 2: ENSMUSG00000103587 TPM 2 27.65904 Counts 2 28.01962
## 3: ENSMUSG00000052105 TPM 3 21.86338 Counts 3 12.95269
## 4: ENSMUSG00000027860 TPM 4 17.78000 <NA> NA NA
## 5: ENSMUSG00000067599 TPM 5 51.07036 Counts 8 51.54967
## 6: ENSMUSG00000049881 TPM 6 39.64867 Counts 9 31.82926
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: 4.860224 0 1.0 None
## 2: 3.729754 0 2.0 None
## 3: 3.344265 0 3.0 None
## 4: NA NA NA None
## 5: 4.341793 -3 6.5 None
## 6: 4.017523 -3 7.5 None
head(down.lfc.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000081228 TPM 1 17.89201 Counts 1 17.90003
## 2: ENSMUSG00000026616 TPM 2 42.71798 Counts 3 22.99255
## 3: ENSMUSG00000025929 TPM 3 39.39414 Counts 6 39.64370
## 4: ENSMUSG00000038305 TPM 4 27.67164 Counts 7 22.49073
## 5: ENSMUSG00000037129 TPM 5 21.70495 Counts 8 21.76394
## 6: ENSMUSG00000030769 TPM 6 61.98969 Counts 10 51.13445
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: 3.289969 0 1.0 None
## 2: 3.992944 -1 2.5 None
## 3: 4.081192 -3 4.5 None
## 4: 3.661431 -3 5.5 None
## 5: 3.483911 -3 6.5 None
## 6: 4.472289 -4 8.0 None
# Set a function plotting gene ranks between TPM- (x-axis) and Counts-Inputs (y-axis)
ranking.plot.fn <- function(df, rankedby) {
df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)
ggplot(df,
aes(x=Rank.x, y=Rank.y, color=logMeanExpression)) + geom_point(alpha=0.5) + facet_grid(~Shrinkage) + theme_bw() + theme(strip.text.x=element_text(size=10)) + xlab("Rank with TPM") + ylab("Rank with Counts") + geom_abline(slope=1, color="black", size=0.5) + ggtitle(paste(rankedby, "Ranking with TPM vs Count Inputs")) + scale_color_gradient(low="blue", high="red")
}
# Print output plots
ranking.plot.fn(fdr.rank.df, "FDR")
ranking.plot.fn(lfc.rank.df, "Log2FoldChange")
ranking.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)")
ranking.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)")
# Set a function plotting the rank difference over the gene expression level
rankdiff.plot.fn <- function(df, rankedby, ylim) {
df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)
ggplot(df, aes(x=logMeanExpression, y=RankDiff, color=MeanRank)) +
geom_point(alpha=0.5) +
theme_bw() +
facet_grid(~Shrinkage) +
theme(strip.text.x=element_text(size=10)) +
ylab("Rank Difference (TPM - Count)") +
ggtitle(paste("Rank Difference (TPM - Count):", rankedby)) +
geom_hline(yintercept=0, color="black", size=0.5) + scale_color_gradient(low="blue", high="red")
}
# Print output plots
rankdiff.plot.fn(fdr.rank.df, "FDR")
rankdiff.plot.fn(lfc.rank.df, "Log2FoldChange")
rankdiff.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)")
rankdiff.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)")
# Set a function subsetting unshrunken data
unshr.rankdiff.fn <- function(df) {subset(df, Shrinkage == "None")}
# Create a list storing rankdiff data frames
rankList <- list(unshr.rankdiff.fn(fdr.rank.df),
unshr.rankdiff.fn(lfc.rank.df),
unshr.rankdiff.fn(up.lfc.rank.df),
unshr.rankdiff.fn(down.lfc.rank.df))
# Assine result column as a factor with levels
rankdiff.levels <- c("FDR",
"log2FoldChange",
"log2FoldChange.Increase",
"log2FoldChange.Decrease")
# Name the list
names(rankList) <- rankdiff.levels
# Create a new data frame storing rank difference by result type
rankdiff.dist <- data.frame(FDR=abs(rankList[[1]]$RankDiff),
log2FoldChange=abs(rankList[[2]]$RankDiff),
log2FoldChange.Increase=abs(rankList[[3]]$RankDiff),
log2FoldChange.Decrease=abs(rankList[[4]]$RankDiff)) %>% gather(Result, RankDiff)
rankdiff.dist$Result <- factor(rankdiff.dist$Result, levels=rankdiff.levels)
# Plot distribution of absolute rank difference
ggplot(rankdiff.dist,
aes(x=Result, y=RankDiff, color=Result)) +
geom_jitter(alpha=0.5) +
geom_boxplot(alpha=0.5, fill="grey", color="black") +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
ggtitle("Distribution of Absolute Rank Difference without Shrinkage \n(TPM - Count Input)") +
ylab("Absolute Rank Difference (TPM - Count Input)")
# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>%
group_by(GENEID) %>%
summarize(num.trans=n_distinct(TXID))
# Set a function adding the number of transcripts by gene id
add.ntrans.fn <- function(df) {
left_join(df, AnnoDb.ntrans, by=c("Gene"="GENEID"))}
# Add a column indicating the number of transcripts by gene id to every rankdiff data frame
for (x in rankdiff.levels) {
rankList[[x]] <- add.ntrans.fn(rankList[[x]])
}
# Explore the edited data frames
summary(rankList)
## Length Class Mode
## FDR 12 data.table list
## log2FoldChange 12 data.table list
## log2FoldChange.Increase 12 data.table list
## log2FoldChange.Decrease 12 data.table list
head(rankList[[1]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000026463 TPM 1 1486.8997 Counts 1 1495.3259
## 2: ENSMUSG00000022018 TPM 2 637.3808 Counts 2 641.6187
## 3: ENSMUSG00000032011 TPM 3 16059.7891 Counts 3 16149.4069
## 4: ENSMUSG00000040498 TPM 4 712.2468 Counts 4 716.4666
## 5: ENSMUSG00000047180 TPM 5 1001.3846 Counts 5 1007.8261
## 6: ENSMUSG00000029468 TPM 6 1457.2985 Counts 6 1456.8887
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: 7.711801 0 1 None 8
## 2: 6.865046 0 2 None 1
## 3: 10.091397 0 3 None 6
## 4: 6.975862 0 4 None 4
## 5: 7.316746 0 5 None 2
## 6: 7.689711 0 6 None 8
head(rankList[[2]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000102298 TPM 1 85.84981 Counts 1 86.40658
## 2: ENSMUSG00000103587 TPM 2 27.65904 Counts 2 28.01962
## 3: ENSMUSG00000081228 TPM 3 17.89201 Counts 3 17.90003
## 4: ENSMUSG00000026616 TPM 4 42.71798 Counts 5 22.99255
## 5: ENSMUSG00000052105 TPM 5 21.86338 Counts 6 12.95269
## 6: ENSMUSG00000025929 TPM 6 39.39414 Counts 9 39.64370
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: 4.860224 0 1.0 None 1
## 2: 3.729754 0 2.0 None 1
## 3: 3.289969 0 3.0 None 1
## 4: 3.992944 -1 4.5 None 13
## 5: 3.344265 -1 5.5 None 10
## 6: 4.081192 -3 7.5 None 1
head(rankList[[3]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000102298 TPM 1 85.84981 Counts 1 86.40658
## 2: ENSMUSG00000103587 TPM 2 27.65904 Counts 2 28.01962
## 3: ENSMUSG00000052105 TPM 3 21.86338 Counts 3 12.95269
## 4: ENSMUSG00000027860 TPM 4 17.78000 <NA> NA NA
## 5: ENSMUSG00000067599 TPM 5 51.07036 Counts 8 51.54967
## 6: ENSMUSG00000049881 TPM 6 39.64867 Counts 9 31.82926
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: 4.860224 0 1.0 None 1
## 2: 3.729754 0 2.0 None 1
## 3: 3.344265 0 3.0 None 10
## 4: NA NA NA None 8
## 5: 4.341793 -3 6.5 None 4
## 6: 4.017523 -3 7.5 None 4
head(rankList[[4]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000081228 TPM 1 17.89201 Counts 1 17.90003
## 2: ENSMUSG00000026616 TPM 2 42.71798 Counts 3 22.99255
## 3: ENSMUSG00000025929 TPM 3 39.39414 Counts 6 39.64370
## 4: ENSMUSG00000038305 TPM 4 27.67164 Counts 7 22.49073
## 5: ENSMUSG00000037129 TPM 5 21.70495 Counts 8 21.76394
## 6: ENSMUSG00000030769 TPM 6 61.98969 Counts 10 51.13445
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: 3.289969 0 1.0 None 1
## 2: 3.992944 -1 2.5 None 13
## 3: 4.081192 -3 4.5 None 1
## 4: 3.661431 -3 5.5 None 15
## 5: 3.483911 -3 6.5 None 1
## 6: 4.472289 -4 8.0 None 8
# Set a function plotting rank difference vs number of transcripts
rank.ntrans.plot.fn <- function(df, title) {
ggplot(df, aes(x=num.trans, y=abs(RankDiff), color=MeanRank)) +
geom_jitter(alpha=0.5) +
theme_bw() +
ggtitle(paste("Rank Difference vs Number of Alternative Transcripts \nin", title)) +
xlab("Number of Alternative Transcripts") +
ylab("Absolute Rank Difference (TPM - Counts)") + scale_color_gradient(low="blue", high="red")
}
# Print the plots
rank.ntrans.plot.fn(rankList[[1]], "FDR")
rank.ntrans.plot.fn(rankList[[2]], "log2FoldChange")
rank.ntrans.plot.fn(rankList[[3]], "log2FoldChange (Increase)")
rank.ntrans.plot.fn(rankList[[4]], "log2FoldChange (Decrease)")
# Initialize a list storing rankdiff genes
large.rankdiff <- rankList
# Assign a vector storing minimum (thresholds) rankdiff for filtering large rankdiff genes
rankdiff.thr <- c(10, 10, 10, 10)
names(rankdiff.thr) <- rankdiff.levels
for (x in rankdiff.levels) {
# Filter out observations below the rankdiff thresholds
large.rankdiff[[x]] <- subset(rankList[[x]],
abs(RankDiff) > rankdiff.thr[x])
}
# Explore the filtered genes
summary(large.rankdiff)
## Length Class Mode
## FDR 12 data.table list
## log2FoldChange 12 data.table list
## log2FoldChange.Increase 12 data.table list
## log2FoldChange.Decrease 12 data.table list
dim(large.rankdiff[[rankdiff.levels[1]]])
## [1] 364 12
dim(large.rankdiff[[rankdiff.levels[2]]])
## [1] 713 12
dim(large.rankdiff[[rankdiff.levels[3]]])
## [1] 5 12
dim(large.rankdiff[[rankdiff.levels[4]]])
## [1] 13 12
# Set a function saving rankdiff.csv files
saving.fn <- function(input.df, data.type) {
filename <- paste0("tpmVScount_", data.type, "_RankDiff.csv")
return(write.csv(input.df, filename))
}
# Save the filtered and arranged data frames as csv files
saving.fn(large.rankdiff[[rankdiff.levels[1]]], "FDR")
saving.fn(large.rankdiff[[rankdiff.levels[2]]], "LFC")
saving.fn(large.rankdiff[[rankdiff.levels[3]]], "LFC_Increase")
saving.fn(large.rankdiff[[rankdiff.levels[4]]], "LFC_Decrease")
# Set a function cleaning data to generate upset plots
upset.input.fn <- function(df) {
df <- df %>%
# Filter genes with valid padj
subset(!is.na(padj)) %>%
mutate(Up=ifelse(FDR == FDRv[1] & log2FoldChange > 0, Gene, ""), # What are upregulated genes?
Down=ifelse(FDR == FDRv[1] & log2FoldChange < 0, Gene, ""), # What are downregulated genes?
Unchanged=ifelse(FDR == FDRv[2], Gene, ""), # What are unchanged genes?
TPM_Input=ifelse(Input == "TPM", Gene, ""), # What are the genes from TPM input?
Counts_Input=ifelse(Input == "Counts", Gene, "")) # What are the genes from Counts input?
# Create a list storing groups of interest
upsetInput <- list(Up=df$Up,
Down=df$Down,
Unchanged=df$Unchanged,
TPM_Input=df$TPM,
Counts_Input=df$Counts)
return(upsetInput)
}
upsetList <- upset.input.fn(both.df)
# Create the upset plot
upset(fromList(upsetList),
sets=names(upsetList), # What group to display
sets.x.label="Number of Genes per Group",
order.by="freq",
point.size=3,
sets.bar.color=c("red", "red","blue", "blue", "blue"),
text.scale = 1.5, number.angles=30)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/mira/miniconda3/envs/r/lib/libopenblasp-r0.3.10.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ashr_2.2-47 apeglm_1.12.0
## [3] ensembldb_2.14.0 AnnotationFilter_1.14.0
## [5] GenomicFeatures_1.42.1 AnnotationDbi_1.52.0
## [7] UpSetR_1.4.0 pheatmap_1.0.12
## [9] gridExtra_2.3 DESeq2_1.30.0
## [11] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [13] MatrixGenerics_1.2.0 matrixStats_0.57.0
## [15] GenomicRanges_1.42.0 GenomeInfoDb_1.26.2
## [17] IRanges_2.24.0 S4Vectors_0.28.1
## [19] tximport_1.18.0 forcats_0.5.0
## [21] stringr_1.4.0 dplyr_1.0.2
## [23] purrr_0.3.4 readr_1.4.0
## [25] tidyr_1.1.2 tibble_3.0.4
## [27] ggplot2_3.3.2 tidyverse_1.3.0
## [29] AnnotationHub_2.22.0 BiocFileCache_1.14.0
## [31] dbplyr_2.0.0 BiocGenerics_0.36.0
## [33] rmarkdown_2.5 data.table_1.13.4
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.2.1
## [3] plyr_1.8.6 lazyeval_0.2.2
## [5] splines_4.0.3 BiocParallel_1.24.1
## [7] digest_0.6.27 invgamma_1.1
## [9] htmltools_0.5.0 SQUAREM_2020.5
## [11] fansi_0.4.1 magrittr_2.0.1
## [13] memoise_1.1.0 Biostrings_2.58.0
## [15] annotate_1.68.0 modelr_0.1.8
## [17] askpass_1.1 bdsmatrix_1.3-4
## [19] prettyunits_1.1.1 colorspace_2.0-0
## [21] blob_1.2.1 rvest_0.3.6
## [23] rappdirs_0.3.1 haven_2.3.1
## [25] xfun_0.19 crayon_1.3.4
## [27] RCurl_1.98-1.2 jsonlite_1.7.2
## [29] genefilter_1.72.0 survival_3.2-7
## [31] glue_1.4.2 gtable_0.3.0
## [33] zlibbioc_1.36.0 XVector_0.30.0
## [35] DelayedArray_0.16.0 scales_1.1.1
## [37] mvtnorm_1.1-1 DBI_1.1.0
## [39] Rcpp_1.0.5 xtable_1.8-4
## [41] progress_1.2.2 emdbook_1.3.12
## [43] bit_4.0.4 truncnorm_1.0-8
## [45] httr_1.4.2 RColorBrewer_1.1-2
## [47] ellipsis_0.3.1 farver_2.0.3
## [49] pkgconfig_2.0.3 XML_3.99-0.5
## [51] locfit_1.5-9.4 labeling_0.4.2
## [53] tidyselect_1.1.0 rlang_0.4.9
## [55] later_1.1.0.1 munsell_0.5.0
## [57] BiocVersion_3.12.0 cellranger_1.1.0
## [59] tools_4.0.3 cli_2.2.0
## [61] generics_0.1.0 RSQLite_2.2.1
## [63] broom_0.7.2 evaluate_0.14
## [65] fastmap_1.0.1 yaml_2.2.1
## [67] knitr_1.30 bit64_4.0.5
## [69] fs_1.5.0 mime_0.9
## [71] xml2_1.3.2 biomaRt_2.46.0
## [73] compiler_4.0.3 rstudioapi_0.13
## [75] curl_4.3 interactiveDisplayBase_1.28.0
## [77] reprex_0.3.0 geneplotter_1.68.0
## [79] stringi_1.5.3 ps_1.5.0
## [81] lattice_0.20-41 ProtGenerics_1.22.0
## [83] Matrix_1.2-18 vctrs_0.3.5
## [85] pillar_1.4.7 lifecycle_0.2.0
## [87] BiocManager_1.30.10 bitops_1.0-6
## [89] irlba_2.3.3 httpuv_1.5.4
## [91] rtracklayer_1.50.0 R6_2.5.0
## [93] promises_1.1.1 MASS_7.3-53
## [95] assertthat_0.2.1 openssl_1.4.3
## [97] withr_2.3.0 GenomicAlignments_1.26.0
## [99] Rsamtools_2.6.0 GenomeInfoDbData_1.2.4
## [101] hms_0.5.3 grid_4.0.3
## [103] coda_0.19-4 mixsqp_0.3-43
## [105] bbmle_1.0.23.1 numDeriv_2016.8-1.1
## [107] shiny_1.5.0 lubridate_1.7.9.2